In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%load_ext autotime
In [2]:
from baselines import remove_na, tidy_labels, map_aggression_score_to_2class
import pandas as pd
In [3]:
"""
# v4_annotated
user_blocked = [
'annotated_onion_layer_5_rows_0_to_5000_raters_20',
'annotated_onion_layer_5_rows_0_to_10000',
'annotated_onion_layer_5_rows_0_to_10000_raters_3',
'annotated_onion_layer_5_rows_10000_to_50526_raters_10',
'annotated_onion_layer_10_rows_0_to_1000',
'annotated_onion_layer_20_rows_0_to_1000',
'annotated_onion_layer_30_rows_0_to_1000',
]
user_random = [
'annotated_random_data_rows_0_to_5000_raters_20',
'annotated_random_data_rows_5000_to_10000',
'annotated_random_data_rows_5000_to_10000_raters_3',
'annotated_random_data_rows_10000_to_20000_raters_10',
]
article_blocked = ['article_onion_layer_5_all_rows_raters_10',]
article_random = ['article_random_data_all_rows_raters_10',]
"""
user_blocked = [
    'user_blocked',
    'user_blocked_2',
    'user_blocked_3',
    'user_blocked_4',
    'user_blocked_layer_10',
    'user_blocked_layer_20',
    'user_blocked_layer_30',
]
user_random = [
    'user_random',
    'user_random_2',
    'user_random_3',
    'user_random_4',
    'user_random_extra_baselines',
]
article_blocked = [
    'article_blocked',
    'article_blocked_layer_5_extra_baselines',
]
article_random = [
    'article_random',
    'article_random_extra_baselines',
]
files = {
    'user': {'blocked': user_blocked, 'random': user_random},
    'article': {'blocked': article_blocked, 'random': article_random},
}
dfs = []
for ns, d in files.items():
    for sample, fs in d.items():
        for f in fs:
            df = pd.read_csv('../../data/annotations/raw/%s/%s.csv' % (ns, f))
            df['src'] = f
            df['ns'] = ns
            df['sample'] = sample
            dfs.append(df)
df = pd.concat(dfs)
print('# annotations: ', df.shape[0])
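The loop above assumes a fixed `../../data/annotations/raw/<ns>/<file>.csv` layout. A small pre-flight check along these lines (not part of the original notebook) makes a missing file fail fast:
import os

# Optional pre-flight check for the expected raw-data layout.
for ns, d in files.items():
    for sample, fs in d.items():
        for f in fs:
            path = '../../data/annotations/raw/%s/%s.csv' % (ns, f)
            assert os.path.isfile(path), 'missing annotation file: %s' % path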
In [4]:
df.drop_duplicates(subset=['rev_id', 'sample'])['rev_id'].value_counts().value_counts()
Out[4]:
In [5]:
df.index = df.rev_id
df['sample_count'] = df.drop_duplicates(subset=['rev_id', 'sample'])['rev_id'].value_counts()
In [6]:
df.sample_count.value_counts()
Out[6]:
In [7]:
# rev_ids drawn into both the blocked and random samples: just label them all as random
df.loc[df.sample_count == 2, 'sample'] = 'random'
In [8]:
df.drop_duplicates(subset=['rev_id', 'sample'])['rev_id'].value_counts().value_counts()
Out[8]:
In [9]:
del df['sample_count']
In [10]:
print('# annotations: ', df.shape[0])
In [11]:
df = tidy_labels(df)
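`tidy_labels` comes from the local `baselines` module and is not shown here. Judging from the columns used later (`not_attack`, `other`, `quoting`, `recipient`, `third_party`, `attack`), it appears to expand the multi-label `is_harassment_or_attack` string into boolean indicator columns. A hypothetical sketch of that idea, assuming the selected labels are concatenated into a single string:
# Hypothetical sketch only; the real implementation lives in baselines.py.
LABELS = ['not_attack', 'other', 'quoting', 'recipient', 'third_party']

def tidy_labels_sketch(d, col='is_harassment_or_attack', labels=LABELS):
    for label in labels:
        d[label] = d[col].astype(str).str.contains(label)
    # anything beyond a plain 'not_attack' selection is treated as an attack
    d['attack'] = ~d['not_attack']
    return d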
In [12]:
df['aggression'] = df['aggression_score'].apply(map_aggression_score_to_2class)
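`map_aggression_score_to_2class` is also imported from `baselines`. A plausible sketch, assuming the aggression score is collapsed at zero with negative scores counted as aggressive (the actual threshold and direction are defined in baselines.py):
# Hypothetical sketch only; see baselines.py for the real mapping.
def map_aggression_score_to_2class_sketch(score):
    return 1 if score < 0 else 0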
In [13]:
df = df.query('_golden == False')
print('# annotations: ', df.shape[0])
In [14]:
# remove all annotations for revisions where more than 50% of annotators could not read the comment
df = remove_na(df)
print('# annotations: ', df.shape[0])
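`remove_na` is imported from `baselines` and not shown here; per the comment above, it drops every annotation of a revision once more than half of that revision's annotators marked it unreadable. A minimal sketch of that filter, assuming a boolean `na` column per annotation:
# Hypothetical sketch of the >50%-unreadable filter; details may differ in baselines.py.
def remove_na_sketch(d):
    na_rate = d.groupby('rev_id')['na'].mean()   # fraction of unreadable votes per revision
    readable = na_rate[na_rate <= 0.5].index
    return d[d['rev_id'].isin(readable)]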
In [15]:
# remove all annotations where the annotator could not read the comment
df = df.query('na==False')
print('# annotations: ', df.shape[0])
In [16]:
df['aggression_score'].value_counts(dropna=False)
Out[16]:
In [17]:
df['is_harassment_or_attack'].value_counts(dropna=False)
Out[17]:
In [18]:
df = df.dropna(subset = ['aggression_score', 'is_harassment_or_attack'])
print('# annotations: ', df.shape[0])
In [19]:
# remove all annotations from users who are ambivalent in 10% or more of revisions
# we consider these users unreliable
def ambivalent(s):
    return 'not_attack' in s and s != 'not_attack'
df['ambivalent'] = df['is_harassment_or_attack'].apply(ambivalent)
non_ambivalent_workers = df.groupby('_worker_id', as_index = False)['ambivalent'].mean().query('ambivalent < 0.1')
df = df.merge(non_ambivalent_workers[['_worker_id']], how = 'inner', on = '_worker_id')
print('# annotations: ', df.shape[0])
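To make the `ambivalent` predicate concrete: an annotation is ambivalent when `not_attack` was selected together with at least one attack label, i.e. the label string contains `not_attack` but is not exactly `not_attack`. Illustrative values (the newline delimiter is an assumption about the raw multi-select format):
assert not ambivalent('not_attack')              # only "not an attack" selected
assert ambivalent('not_attack\nrecipient')       # contradictory selection
assert not ambivalent('recipient\nthird_party')  # attack labels only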
In [20]:
# remove all other ambivalent annotations
df = df.query('ambivalent==False')
print('# annotations: ', df.shape[0])
In [21]:
df.groupby(['rev_id', '_worker_id']).size().value_counts()
Out[21]:
In [22]:
df = df.drop_duplicates(subset = ['rev_id', '_worker_id'])
print('# annotations: ', df.shape[0])
In [23]:
comments = df.drop_duplicates(subset = ['rev_id'])
print(comments.shape[0])
In [24]:
u_comments = comments.drop_duplicates(subset = ['clean_diff'])
print(u_comments.shape[0])
In [25]:
comments[comments.duplicated(subset = ['clean_diff'])].head(5)
Out[25]:
In [26]:
df = df.merge(u_comments[['rev_id']], how = 'inner', on = 'rev_id')
print('# annotations: ', df.shape[0])
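The two preceding steps keep a single revision per distinct `clean_diff` and then restrict the annotations to those revisions. As a quick sanity check (added here, not part of the original run), no duplicate comment text should survive:
# Every remaining revision should have unique comment text.
assert not df.drop_duplicates(subset=['rev_id']).duplicated(subset=['clean_diff']).any()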
In [27]:
df['recipient'].value_counts(dropna=False)
Out[27]:
In [28]:
df['attack'].value_counts(dropna=False)
Out[28]:
In [29]:
df['aggression'].value_counts(dropna=False)
Out[29]:
In [30]:
counts = df['rev_id'].value_counts().to_frame()
counts.columns = ['n']
counts['rev_id'] = counts.index
In [31]:
counts.shape
Out[31]:
In [32]:
counts['n'].value_counts().head()
Out[32]:
In [33]:
counts_enough = counts.query("n>=8")
In [34]:
counts_enough.shape
Out[34]:
In [35]:
df = df.merge(counts_enough[['rev_id']], how = 'inner', on = 'rev_id')
print('# annotations: ', df.shape[0])
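The cells above keep only revisions with at least 8 remaining annotations, via an explicit counts frame and an inner merge. An equivalent, more compact formulation (a design alternative, not what the notebook ran) is a groupby filter:
# Equivalent filter: keep revisions with at least 8 annotations.
df_enough = df.groupby('rev_id').filter(lambda g: len(g) >= 8)
assert df_enough.shape[0] == df.shape[0]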
In [50]:
df.columns
Out[50]:
In [36]:
cols = ['rev_id', '_worker_id', 'ns', 'sample', 'src', 'clean_diff', 'diff', 'insert_only', 'page_id',
        'page_title', 'rev_comment', 'rev_timestamp',
        'user_id', 'user_text', 'not_attack', 'other', 'quoting', 'recipient',
        'third_party', 'attack', 'aggression', 'aggression_score']
df = df[cols]
In [41]:
df.groupby(['ns', 'sample']).size()
Out[41]:
In [42]:
df.to_csv('../../data/annotations/clean/annotations.tsv', index=False, sep='\t')
In [43]:
pd.read_csv('../../data/annotations/clean/annotations.tsv', sep='\t').shape
Out[43]: